# The usual preamble
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Make the graphs a bit prettier, and bigger
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 5)
plt.rcParams['font.family'] = 'sans-serif'
# This is necessary to show lots of columns in pandas 0.12.
# Not necessary in pandas 0.13.
pd.set_option('display.width', 5000)
pd.set_option('display.max_columns', 60)
One of the main problems with messy data is: how do you know if it's messy or not?
We're going to use the NYC 311 service request dataset again here, since it's big and a bit unwieldy.
requests = pd.read_csv('../data/311-service-requests.csv', dtype='unicode')
We're going to look at a few columns here. I know already that there are some problems with the zip code, so let's look at that first.
To get a sense for whether a column has problems, I usually use .unique()
to look at all its values. If it's a numeric column, I'll instead plot a histogram to get a sense of the distribution.
When we look at the unique values in "Incident Zip", it quickly becomes clear that this is a mess.
Some of the problems:
nan
s 29616-0759
or 83
What we can do:
requests['Incident Zip'].unique()
array(['11432', '11378', '10032', '10023', '10027', '11372', '11419', '11417', '10011', '11225', '11218', '10003', '10029', '10466', '11219', '10025', '10310', '11236', nan, '10033', '11216', '10016', '10305', '10312', '10026', '10309', '10036', '11433', '11235', '11213', '11379', '11101', '10014', '11231', '11234', '10457', '10459', '10465', '11207', '10002', '10034', '11233', '10453', '10456', '10469', '11374', '11221', '11421', '11215', '10007', '10019', '11205', '11418', '11369', '11249', '10005', '10009', '11211', '11412', '10458', '11229', '10065', '10030', '11222', '10024', '10013', '11420', '11365', '10012', '11214', '11212', '10022', '11232', '11040', '11226', '10281', '11102', '11208', '10001', '10472', '11414', '11223', '10040', '11220', '11373', '11203', '11691', '11356', '10017', '10452', '10280', '11217', '10031', '11201', '11358', '10128', '11423', '10039', '10010', '11209', '10021', '10037', '11413', '11375', '11238', '10473', '11103', '11354', '11361', '11106', '11385', '10463', '10467', '11204', '11237', '11377', '11364', '11434', '11435', '11210', '11228', '11368', '11694', '10464', '11415', '10314', '10301', '10018', '10038', '11105', '11230', '10468', '11104', '10471', '11416', '10075', '11422', '11355', '10028', '10462', '10306', '10461', '11224', '11429', '10035', '11366', '11362', '11206', '10460', '10304', '11360', '11411', '10455', '10475', '10069', '10303', '10308', '10302', '11357', '10470', '11367', '11370', '10454', '10451', '11436', '11426', '10153', '11004', '11428', '11427', '11001', '11363', '10004', '10474', '11430', '10000', '10307', '11239', '10119', '10006', '10048', '11697', '11692', '11693', '10573', '00083', '11559', '10020', '77056', '11776', '70711', '10282', '11109', '10044', '02061', '77092-2016', '14225', '55164-0737', '19711', '07306', '000000', 'NO CLUE', '90010', '11747', '23541', '11788', '07604', '10112', '11563', '11580', '07087', '11042', '07093', '11501', '92123', '00000', '11575', '07109', '11797', '10803', '11716', '11722', '11549-3650', '10162', '23502', '11518', '07020', '08807', '11577', '07114', '11003', '07201', '61702', '10103', '29616-0759', '35209-3114', '11520', '11735', '10129', '11005', '41042', '11590', '06901', '07208', '11530', '13221', '10954', '11111', '10107'], dtype=object)
We can pass a na_values
option to pd.read_csv
to clean this up a little bit. We can also specify that the type of Incident Zip is a string, not a float.
na_values = ['NO CLUE', 'N/A', '0']
requests = pd.read_csv('../data/311-service-requests.csv', na_values=na_values, dtype={'Incident Zip': str})
requests['Incident Zip'].unique()
array(['11432', '11378', '10032', '10023', '10027', '11372', '11419', '11417', '10011', '11225', '11218', '10003', '10029', '10466', '11219', '10025', '10310', '11236', nan, '10033', '11216', '10016', '10305', '10312', '10026', '10309', '10036', '11433', '11235', '11213', '11379', '11101', '10014', '11231', '11234', '10457', '10459', '10465', '11207', '10002', '10034', '11233', '10453', '10456', '10469', '11374', '11221', '11421', '11215', '10007', '10019', '11205', '11418', '11369', '11249', '10005', '10009', '11211', '11412', '10458', '11229', '10065', '10030', '11222', '10024', '10013', '11420', '11365', '10012', '11214', '11212', '10022', '11232', '11040', '11226', '10281', '11102', '11208', '10001', '10472', '11414', '11223', '10040', '11220', '11373', '11203', '11691', '11356', '10017', '10452', '10280', '11217', '10031', '11201', '11358', '10128', '11423', '10039', '10010', '11209', '10021', '10037', '11413', '11375', '11238', '10473', '11103', '11354', '11361', '11106', '11385', '10463', '10467', '11204', '11237', '11377', '11364', '11434', '11435', '11210', '11228', '11368', '11694', '10464', '11415', '10314', '10301', '10018', '10038', '11105', '11230', '10468', '11104', '10471', '11416', '10075', '11422', '11355', '10028', '10462', '10306', '10461', '11224', '11429', '10035', '11366', '11362', '11206', '10460', '10304', '11360', '11411', '10455', '10475', '10069', '10303', '10308', '10302', '11357', '10470', '11367', '11370', '10454', '10451', '11436', '11426', '10153', '11004', '11428', '11427', '11001', '11363', '10004', '10474', '11430', '10000', '10307', '11239', '10119', '10006', '10048', '11697', '11692', '11693', '10573', '00083', '11559', '10020', '77056', '11776', '70711', '10282', '11109', '10044', '02061', '77092-2016', '14225', '55164-0737', '19711', '07306', '000000', '90010', '11747', '23541', '11788', '07604', '10112', '11563', '11580', '07087', '11042', '07093', '11501', '92123', '00000', '11575', '07109', '11797', '10803', '11716', '11722', '11549-3650', '10162', '23502', '11518', '07020', '08807', '11577', '07114', '11003', '07201', '61702', '10103', '29616-0759', '35209-3114', '11520', '11735', '10129', '11005', '41042', '11590', '06901', '07208', '11530', '13221', '10954', '11111', '10107'], dtype=object)
rows_with_dashes = requests['Incident Zip'].str.contains('-').fillna(False)
len(requests[rows_with_dashes])
5
requests[rows_with_dashes]
Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | Street Name | Cross Street 1 | Cross Street 2 | Intersection Street 1 | Intersection Street 2 | Address Type | City | Landmark | Facility Type | Status | Due Date | Resolution Action Updated Date | Community Board | Borough | X Coordinate (State Plane) | Y Coordinate (State Plane) | Park Facility Name | Park Borough | School Name | School Number | School Region | School Code | School Phone Number | School Address | School City | School State | School Zip | School Not Found | School or Citywide Complaint | Vehicle Type | Taxi Company Borough | Taxi Pick Up Location | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
29136 | 26550551 | 10/24/2013 06:16:34 PM | NaN | DCA | Department of Consumer Affairs | Consumer Complaint | False Advertising | NaN | 77092-2016 | 2700 EAST SELTICE WAY | EAST SELTICE WAY | NaN | NaN | NaN | NaN | NaN | HOUSTON | NaN | NaN | Assigned | 11/13/2013 11:15:20 AM | 10/29/2013 11:16:16 AM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
30939 | 26548831 | 10/24/2013 09:35:10 AM | NaN | DCA | Department of Consumer Affairs | Consumer Complaint | Harassment | NaN | 55164-0737 | P.O. BOX 64437 | 64437 | NaN | NaN | NaN | NaN | NaN | ST. PAUL | NaN | NaN | Assigned | 11/13/2013 02:30:21 PM | 10/29/2013 02:31:06 PM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
70539 | 26488417 | 10/15/2013 03:40:33 PM | NaN | TLC | Taxi and Limousine Commission | Taxi Complaint | Driver Complaint | Street | 11549-3650 | 365 HOFSTRA UNIVERSITY | HOFSTRA UNIVERSITY | NaN | NaN | NaN | NaN | NaN | HEMSTEAD | NaN | NaN | Assigned | 11/30/2013 01:20:33 PM | 10/16/2013 01:21:39 PM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | La Guardia Airport | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
85821 | 26468296 | 10/10/2013 12:36:43 PM | 10/26/2013 01:07:07 AM | DCA | Department of Consumer Affairs | Consumer Complaint | Debt Not Owed | NaN | 29616-0759 | PO BOX 25759 | BOX 25759 | NaN | NaN | NaN | NaN | NaN | GREENVILLE | NaN | NaN | Closed | 10/26/2013 09:20:28 AM | 10/26/2013 01:07:07 AM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
89304 | 26461137 | 10/09/2013 05:23:46 PM | 10/25/2013 01:06:41 AM | DCA | Department of Consumer Affairs | Consumer Complaint | Harassment | NaN | 35209-3114 | 600 BEACON PKWY | BEACON PKWY | NaN | NaN | NaN | NaN | NaN | BIRMINGHAM | NaN | NaN | Closed | 10/25/2013 02:43:42 PM | 10/25/2013 01:06:41 AM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
I thought these were missing data and originally deleted them like this:
requests['Incident Zip'][rows_with_dashes] = np.nan
But then my friend Dave pointed out that 9-digit zip codes are normal. Let's look at all the zip codes with more than 5 digits, make sure they're okay, and then truncate them.
long_zip_codes = requests['Incident Zip'].str.len() > 5
requests['Incident Zip'][long_zip_codes].unique()
array(['77092-2016', '55164-0737', '000000', '11549-3650', '29616-0759', '35209-3114'], dtype=object)
Those all look okay to truncate to me.
requests['Incident Zip'] = requests['Incident Zip'].str.slice(0, 5)
Done.
Earlier I thought 00083 was a broken zip code, but turns out Central Park's zip code 00083! Shows what I know. I'm still concerned about the 00000 zip codes, though: let's look at that.
requests[requests['Incident Zip'] == '00000']
Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | Street Name | Cross Street 1 | Cross Street 2 | Intersection Street 1 | Intersection Street 2 | Address Type | City | Landmark | Facility Type | Status | Due Date | Resolution Action Updated Date | Community Board | Borough | X Coordinate (State Plane) | Y Coordinate (State Plane) | Park Facility Name | Park Borough | School Name | School Number | School Region | School Code | School Phone Number | School Address | School City | School State | School Zip | School Not Found | School or Citywide Complaint | Vehicle Type | Taxi Company Borough | Taxi Pick Up Location | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
42600 | 26529313 | 10/22/2013 02:51:06 PM | NaN | TLC | Taxi and Limousine Commission | Taxi Complaint | Driver Complaint | NaN | 00000 | EWR EWR | EWR | NaN | NaN | NaN | NaN | NaN | NEWARK | NaN | NaN | Assigned | 12/07/2013 09:53:51 AM | 10/23/2013 09:54:43 AM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | Other | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
60843 | 26507389 | 10/17/2013 05:48:44 PM | NaN | TLC | Taxi and Limousine Commission | Taxi Complaint | Driver Complaint | Street | 00000 | 1 NEWARK AIRPORT | NEWARK AIRPORT | NaN | NaN | NaN | NaN | NaN | NEWARK | NaN | NaN | Assigned | 12/02/2013 11:59:46 AM | 10/18/2013 12:01:08 PM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | Other | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
This looks bad to me. Let's set these to nan.
zero_zips = requests['Incident Zip'] == '00000'
requests.loc[zero_zips, 'Incident Zip'] = np.nan
Great. Let's see where we are now:
unique_zips = requests['Incident Zip'].unique()
unique_zips.sort()
unique_zips
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-124-a216f82ef513> in <module> 1 unique_zips = requests['Incident Zip'].unique() ----> 2 unique_zips.sort() 3 unique_zips TypeError: '<' not supported between instances of 'float' and 'str'
Amazing! This is much cleaner. There's something a bit weird here, though -- I looked up 77056 on Google maps, and that's in Texas.
Let's take a closer look:
zips = requests['Incident Zip']
# Let's say the zips starting with '0' and '1' are okay, for now. (this isn't actually true -- 13221 is in Syracuse, and why?)
is_close = zips.str.startswith('0') | zips.str.startswith('1')
# There are a bunch of NaNs, but we're not interested in them right now, so we'll say they're False
is_far = ~(is_close) & zips.notnull()
zips[is_far]
12102 77056 13450 70711 29136 77092 30939 55164 44008 90010 47048 23541 57636 92123 71001 92123 71834 23502 80573 61702 85821 29616 89304 35209 94201 41042 Name: Incident Zip, dtype: object
requests[is_far][['Incident Zip', 'Descriptor', 'City']].sort_values('Incident Zip')
Incident Zip | Descriptor | City | |
---|---|---|---|
71834 | 23502 | Harassment | NORFOLK |
47048 | 23541 | Harassment | NORFOLK |
85821 | 29616 | Debt Not Owed | GREENVILLE |
89304 | 35209 | Harassment | BIRMINGHAM |
94201 | 41042 | Harassment | FLORENCE |
30939 | 55164 | Harassment | ST. PAUL |
80573 | 61702 | Billing Dispute | BLOOMIGTON |
13450 | 70711 | Contract Dispute | CLIFTON |
12102 | 77056 | Debt Not Owed | HOUSTON |
29136 | 77092 | False Advertising | HOUSTON |
44008 | 90010 | Billing Dispute | LOS ANGELES |
57636 | 92123 | Harassment | SAN DIEGO |
71001 | 92123 | Billing Dispute | SAN DIEGO |
Okay, there really are requests coming from LA and Houston! Good to know. Filtering by zip code is probably a bad way to handle this -- we should really be looking at the city instead.
requests['City'].str.upper().value_counts()
BROOKLYN 31662 NEW YORK 22664 BRONX 18438 STATEN ISLAND 4766 JAMAICA 2246 ... BRIARWOOD 1 PORT JEFFERSON STATION 1 NJ 1 ROSELYN 1 RYEBROOK 1 Name: City, Length: 100, dtype: int64
It looks like these are legitimate complaints, so we'll just leave them alone.
Here's what we ended up doing to clean up our zip codes, all together:
na_values = ['NO CLUE', 'N/A', '0']
requests = pd.read_csv('../data/311-service-requests.csv',
na_values=na_values,
dtype={'Incident Zip': str})
def fix_zip_codes(zips):
# Truncate everything to length 5
zips = zips.str.slice(0, 5)
# Set 00000 zip codes to nan
zero_zips = zips == '00000'
zips[zero_zips] = np.nan
return zips
requests['Incident Zip'] = fix_zip_codes(requests['Incident Zip'])
requests['Incident Zip'].unique()
array(['11432', '11378', '10032', '10023', '10027', '11372', '11419', '11417', '10011', '11225', '11218', '10003', '10029', '10466', '11219', '10025', '10310', '11236', nan, '10033', '11216', '10016', '10305', '10312', '10026', '10309', '10036', '11433', '11235', '11213', '11379', '11101', '10014', '11231', '11234', '10457', '10459', '10465', '11207', '10002', '10034', '11233', '10453', '10456', '10469', '11374', '11221', '11421', '11215', '10007', '10019', '11205', '11418', '11369', '11249', '10005', '10009', '11211', '11412', '10458', '11229', '10065', '10030', '11222', '10024', '10013', '11420', '11365', '10012', '11214', '11212', '10022', '11232', '11040', '11226', '10281', '11102', '11208', '10001', '10472', '11414', '11223', '10040', '11220', '11373', '11203', '11691', '11356', '10017', '10452', '10280', '11217', '10031', '11201', '11358', '10128', '11423', '10039', '10010', '11209', '10021', '10037', '11413', '11375', '11238', '10473', '11103', '11354', '11361', '11106', '11385', '10463', '10467', '11204', '11237', '11377', '11364', '11434', '11435', '11210', '11228', '11368', '11694', '10464', '11415', '10314', '10301', '10018', '10038', '11105', '11230', '10468', '11104', '10471', '11416', '10075', '11422', '11355', '10028', '10462', '10306', '10461', '11224', '11429', '10035', '11366', '11362', '11206', '10460', '10304', '11360', '11411', '10455', '10475', '10069', '10303', '10308', '10302', '11357', '10470', '11367', '11370', '10454', '10451', '11436', '11426', '10153', '11004', '11428', '11427', '11001', '11363', '10004', '10474', '11430', '10000', '10307', '11239', '10119', '10006', '10048', '11697', '11692', '11693', '10573', '00083', '11559', '10020', '77056', '11776', '70711', '10282', '11109', '10044', '02061', '77092', '14225', '55164', '19711', '07306', '90010', '11747', '23541', '11788', '07604', '10112', '11563', '11580', '07087', '11042', '07093', '11501', '92123', '11575', '07109', '11797', '10803', '11716', '11722', '11549', '10162', '23502', '11518', '07020', '08807', '11577', '07114', '11003', '07201', '61702', '10103', '29616', '35209', '11520', '11735', '10129', '11005', '41042', '11590', '06901', '07208', '11530', '13221', '10954', '11111', '10107'], dtype=object)